e8511041a942b968fa5ebae67228b39d9db11de1,src/main/java/com/datumbox/examples/Regression.java,Regression,main,#String[]#,48
Before Change
//-----------------
//Normalize continuous variables
//The normalizer is constructed with the name "LaborStatistics" and the configuration object;
//its training parameters are supplied at fit time together with the data.
//NOTE(review): fit_transform presumably rewrites trainingDataframe in place - confirm against the library.
XYMinMaxNormalizer dataTransformer = new XYMinMaxNormalizer("LaborStatistics", conf);
dataTransformer.fit_transform(trainingDataframe, new XYMinMaxNormalizer.TrainingParameters());
//Feature Selection
//-----------------
//Perform dimensionality reduction using PCA
//The PCA object is constructed with the same "LaborStatistics" name and configuration as the normalizer.
PCA featureSelection = new PCA("LaborStatistics", conf);
PCA.TrainingParameters featureSelectionParameters = new PCA.TrainingParameters();
//Limit the output to one dimension fewer than the dataframe's current X column count.
featureSelectionParameters.setMaxDimensions(trainingDataframe.xColumnSize()-1); //remove one dimension
//Whitening is explicitly switched off.
featureSelectionParameters.setWhitened(false);
After Change
//Initialization
//--------------
//Fix the global seed and obtain the configuration object that every later step receives.
RandomGenerator.setGlobalSeed(42L); //optionally set a specific seed for all Random objects
Configuration conf = Configuration.getConfiguration(); //default configuration based on properties file
//The four lines below are optional overrides kept as commented-out examples; none of them is active.
//conf.setDbConfig(new InMemoryConfiguration()); //use In-Memory storage (default)
//conf.setDbConfig(new MapDBConfiguration()); //use MapDB storage
//conf.getConcurrencyConfig().setParallelized(true); //turn on/off the parallelization
//conf.getConcurrencyConfig().setMaxNumberOfThreadsPerTask(4); //set the concurrency level
//Reading Data
//------------
//Load the Longley labor-statistics CSV from the classpath into trainingDataframe.
//The resource is resolved through this example's own class (Regression) rather than an unrelated one,
//and a missing resource fails immediately with a message naming the path instead of a bare NullPointerException.
Dataframe trainingDataframe;
try (Reader fileReader = new InputStreamReader(
        new FileInputStream(
            Paths.get(
                java.util.Objects.requireNonNull(
                    Regression.class.getClassLoader().getResource("datasets/labor-statistics/longley.csv"),
                    "Classpath resource not found: datasets/labor-statistics/longley.csv"
                ).toURI()
            ).toFile()
        ),
        java.nio.charset.StandardCharsets.UTF_8)) { //charset constant instead of the "UTF-8" string lookup
    //LinkedHashMap keeps the columns in insertion order; every column is declared NUMERICAL.
    LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>();
    headerDataTypes.put("Employed", TypeInference.DataType.NUMERICAL);
    headerDataTypes.put("GNP.deflator", TypeInference.DataType.NUMERICAL);
    headerDataTypes.put("GNP", TypeInference.DataType.NUMERICAL);
    headerDataTypes.put("Unemployed", TypeInference.DataType.NUMERICAL);
    headerDataTypes.put("Armed.Forces", TypeInference.DataType.NUMERICAL);
    headerDataTypes.put("Population", TypeInference.DataType.NUMERICAL);
    headerDataTypes.put("Year", TypeInference.DataType.NUMERICAL);
    //"Employed" is passed as the second argument - presumably the response (Y) column; confirm against parseCSVFile.
    //Comma delimiter, double-quote quoting and "\r\n" line endings are passed explicitly; the two nulls are left unset.
    trainingDataframe = Dataframe.Builder.parseCSVFile(fileReader, "Employed", headerDataTypes, ',', '"', "\r\n", null, null, conf);
}
catch(UncheckedIOException | IOException | URISyntaxException ex) {
    //This is an example program: any I/O or URI failure is fatal, so rethrow unchecked with the cause preserved.
    throw new RuntimeException(ex);
}
//Copy taken before any of the transformation steps that follow are run on trainingDataframe.
Dataframe testingDataframe = trainingDataframe.copy();
//Transform Dataframe
//-----------------
//Normalize continuous variables
//The normalizer is obtained from MLBuilder.create with its training parameters and the configuration,
//so fit_transform is called with the data only.
//NOTE(review): fit_transform presumably rewrites trainingDataframe in place - confirm against the library.
XYMinMaxNormalizer dataTransformer = MLBuilder.create(new XYMinMaxNormalizer.TrainingParameters(), conf);
dataTransformer.fit_transform(trainingDataframe);
//Save the fitted normalizer, passing "LaborStatistics" as the name.
dataTransformer.save("LaborStatistics");
//Feature Selection
//-----------------
//Perform dimensionality reduction using PCA
//All parameters are configured first because MLBuilder.create receives the finished parameter object.
PCA.TrainingParameters featureSelectionParameters = new PCA.TrainingParameters();
//Limit the output to one dimension fewer than the dataframe's current X column count.
featureSelectionParameters.setMaxDimensions(trainingDataframe.xColumnSize()-1); //remove one dimension
//Whitening is explicitly switched off.
featureSelectionParameters.setWhitened(false);
//Variance percentage threshold set to 0.99999995.
//NOTE(review): how this threshold interacts with the max-dimensions cap is not visible here - confirm in PCA.
featureSelectionParameters.setVariancePercentageThreshold(0.99999995);
PCA featureSelection = MLBuilder.create(featureSelectionParameters, conf);
featureSelection.fit_transform(trainingDataframe);
//Save the fitted PCA, passing the same "LaborStatistics" name used for the normalizer.
featureSelection.save("LaborStatistics");
//Fit the regressor
//-----------------
//A MatrixLinearRegression is created with default-constructed training parameters and fitted on
//trainingDataframe, which by this point has been passed through the normalizer and PCA fit_transform calls.
MatrixLinearRegression.TrainingParameters param = new MatrixLinearRegression.TrainingParameters();
MatrixLinearRegression regressor = MLBuilder.create(param, conf);
regressor.fit(trainingDataframe);
//Save before closing, passing the same "LaborStatistics" name as the earlier save calls.
regressor.save("LaborStatistics");
regressor.close(); //close the regressor, we will use it again later
//Denormalize trainingDataframe (optional)